{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn import metrics\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the shape of train_image: (13418, 110)\n"
     ]
    }
   ],
   "source": [
    "#读取训练数据和测试数据\n",
    "train = pd.read_csv('Select_events.csv')\n",
    "\n",
    "#X_train = train.drop(\"label\",axis=1).values\n",
    "print('the shape of train_image: {}'.format(train.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train2 = train.drop(['event_id','user_id','start_time','city','state','zip','country','lat','lng'], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型，在校验集上评价聚类算法性能\n",
    "def K_cluster_analysis(K, X_train):\n",
    "    start = time.time()\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    #mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans = KMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(X_train)\n",
    "    \n",
    "    # 在训练集和测试集上测试\n",
    "    #y_train_pred = mb_kmeans.fit_predict(X_train)\n",
    "    #y_val_pred = mb_kmeans.predict(X_val)\n",
    "    \n",
    "    #以前两维特征打印训练数据的分类结果\n",
    "    #plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred)\n",
    "    #plt.show()\n",
    "\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(X_train,mb_kmeans.predict(X_train))\n",
    "    \n",
    "    #也可以在校验集上评估K\n",
    "    #v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "    #print(\"v_score: {}\".format(v_score))\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 2\n",
      "CH_score: 0.9939034900633691, time elaps:16\n",
      "K-means begin with clusters: 3\n",
      "CH_score: 0.7365983342233503, time elaps:16\n",
      "K-means begin with clusters: 5\n",
      "CH_score: 0.5831866399543385, time elaps:15\n",
      "K-means begin with clusters: 8\n",
      "CH_score: 0.523192268696927, time elaps:15\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 0.503387175861415, time elaps:15\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.33964535029001663, time elaps:15\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.2781282700176647, time elaps:16\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.23751410225209155, time elaps:17\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.23735455737796587, time elaps:18\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.2226651970744288, time elaps:19\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.1948738307957061, time elaps:21\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.19112874705387248, time elaps:22\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.18899401080433964, time elaps:21\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.17563806588081465, time elaps:23\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "#10 到 60， 10 为 0.4068 \n",
    "Ks = [ 2,3,5,8,10, 20, 30,40,50,60, 70, 80, 90, 100]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, train2)\n",
    "    CH_scores.append(ch)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7ff22b3029b0>]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAGKxJREFUeJzt3X+cVHW9x/HXZ5cF5IchQoiALOLysyR0VdKumnITuQVimHBvaaZSJmmmlaYPK66pmY+bWlwLf5s3FfEXqWH5IzUfgixWJiC6gMqKyWoqivJj4XP/+M62w+7M7uwys2fOmffz8ZjHzDnzndnP8eB7z37P93yPuTsiIpIsZVEXICIi+adwFxFJIIW7iEgCKdxFRBJI4S4ikkAKdxGRBFK4i4gkkMJdRCSBFO4iIgnUJaof3K9fP6+srIzqx4uIxNKyZcvecvf+bbWLLNwrKyupqamJ6seLiMSSmb2aSzt1y4iIJJDCXUQkgRTuIiIJ1Ga4m9mNZrbBzF7I8r6Z2TVmVmtmz5vZAfkvU0RE2iOXI/ebgUmtvH8sUJV6zAKu3fWyRERkV7QZ7u7+JPDPVppMBW71YDHQx8wG5qtAERFpv3z0uQ8C1qUt16XWiYhIRPIR7pZhXcZ795nZLDOrMbOa+vr6Dv2wP/8ZLrwQtm/v0MdFREpCPsK9DhiStjwYWJ+pobvPc/dqd6/u37/NC6wyWrIELr0UNm3q0MdFREpCPsJ9IXBSatTMBOA9d38jD9+bUe/e4fn99wv1E0RE4q/N6QfM7HbgSKCfmdUBPwQqANz9V8BDwGSgFvgQOKVQxYLCXUQkF22Gu7vPbON9B87MW0VtULiLiLQtdleoNob7Bx9EW4eISDGLXbj36hWedeQuIpJd7MJd3TIiIm1TuIuIJJDCXUQkgWIX7j16gJlOqIqItCZ24V5WBj176shdRKQ1sQt3CF0zCncRkewU7iIiCRTbcFefu4hIdrEM9169dOQuItKaWIa7umVERFqncBcRSSCFu4hIAsU23HVCVUQku1iGe69e4TZ7O3ZEXYmISHGKZbhrTncRkdbFOtzV7y4ikpnCXUQkgWIZ7o13Y1K3jIhIZrEMdx25i4i0TuEuIpJACncRkQRSuIuIJFAsw10nVEVEWhfLcO/ZMzzryF1EJLNYhntZmeZ0FxFpTSzDHTQzpIhIa2Ib7jpyFxHJLrbhrml/RUSyi3W468hdRCQzhbuISAIp3EVEEii24d6rl/rcRUSyiW2468hdRCS7WIe77qMqIpJZrMMd1DUjIpJJbMO9cfIwdc2IiLQU23DXkbuISHY5hbuZTTKzVWZWa2bnZ3h/HzN73Mz+YmbPm9nk/Je6M83pLiKSXZvhbmblwFzgWGAMMNPMxjRrdhEw393HAzOA/813oc0p3EVEssvlyP1goNbd17j7VuAOYGqzNg7snnr9MWB9/krMTOEuIpJdlxzaDALWpS3XAYc0a/Mj4A9m9i2gJzAxL9W1QidURUSyy+XI3TKs82bLM4Gb3X0wMBn4jZm1+G4zm2VmNWZWU19f3/5q0+iEqohIdrmEex0wJG15MC27XU4F5gO4+zNAd6Bf8y9y93nuXu3u1f379+9YxSnqlhERyS6XcF8KVJnZMDPrSjhhurBZm9eAowHMbDQh3Hft0LwNuo+qiEh2bYa7uzcAs4GHgZWEUTHLzWyOmU1JNTsXON3M/gbcDnzV3Zt33eRVWVkIeIW7iEhLuZxQxd0fAh5qtu7itNcrgMPyW1rbNHmYiEhmsb1CFXSrPRGRbGIf7jpyFxFpSeEuIpJAsQ73Xr0U7iIimcQ63NXnLiKSWezDXUfuIiItKdxFRBIo9uGu+6iKiLQU63BvnBlS/e4iIjuLdbhrZkgRkcxiHe577BGed3H2YBGRxIl1uI8aFZ5Xroy2DhGRYhPrcB85EsrLYcWKqCsRESkusQ73bt1gv/1g+fKoKxERKS6xDneAMWMU7iIizcU+3MeOhdpa2LIl6kpERIpHIsJ9+3ZYtSrqSkREikciwh3UNSMiki724T5iRBgxo3AXEWkS+3Dv1g2qqhTuIiLpYh/uELpmFO4iIk0SE+6rV8PmzVFXIiJSHBIT7jt2wIsvRl2JiEhxSEy4g7pmREQaJSLcq6qgSxfNMSMi0igR4d61q0bMiIikS0S4g0bMiIikS1S4r14NH30UdSUiItFLVLi7a8SMiAgkLNxBXTMiIpCgcK+qgooKhbuICCQo3CsqwiRiCncRkQSFO2jEjIhIo8SF+9q18OGHUVciIhKtxIW7O6xcGXUlIiLRSly4g6YhEBFJVLgPH64RMyIikLBwr6iAkSMV7iIiiQp30IgZERFIaLivXQubNkVdiYhIdHIKdzObZGarzKzWzM7P0uZLZrbCzJab2W/zW2buGk+qasSMiJSyNsPdzMqBucCxwBhgppmNadamCrgAOMzdxwLfLkCtOdEcMyIiuR25HwzUuvsad98K3AFMbdbmdGCuu78D4O4b8ltm7oYPDzfvULiLSCnLJdwHAevSlutS69KNAEaY2dNmttjMJmX6IjObZWY1ZlZTX1/fsYrb0KULjBqlcBeR0pZLuFuGdd5suQtQBRwJzASuN7M+LT7kPs/dq929un///u2tNWcaMSMipS6XcK8DhqQtDwbWZ2hzv7tvc/e1wCpC2Edi7Fh49VX44IOoKhARiVYu4b4UqDKzYWbWFZgBLGzW5j7gswBm1o/QTbMmn4W2x5jU6V6NmBGRUtVmuLt7AzAbeBhYCcx39+VmNsfMpqSaPQy8bWYrgMeB77r724Uqui2f+lR4XrAgqgpERKJl7s27zztHdXW119TUFOz7v/Y1uPVWqKlpCnsRkbgzs2XuXt1Wu8RdodroyiuhXz847TRoaIi6GhGRzpXYcO/bF37xC1i2DK66KupqREQ6V2LDHWD6dJg6FS6+GFavjroaEZHOk+hwN4O5c8NUwLNmhbs0iYiUgkSHO8CgQXDFFfDYY3DTTVFXIyLSORIf7gCnnw6HHw7nngv/+EfU1YiIFF5JhHtZGVx3HXz0EXzrW1FXIyJSeCUR7gAjRsAPfxgubLrvvqirEREprJIJd4DzzoNx4+Cb34R33426GhGRwimpcK+ogOuvhzffhO9/P+pqREQKp6TCHaC6Gs45B+bNgyeeiLoaEZHCKLlwB5gzB/bdN4yi+eijqKsREcm/kgz3Hj3g17+Gl1+G2bNh27aoKxIRya+SDHeAiRPhggvgxhvhqKNgffPbj4iIxFjJhjvApZfCb38Lzz0H48fD449HXZGISH6UdLgDzJwJS5eGWSQnToTLLoMdO6KuSkRk15R8uEO4Ld+zz8IJJ8APfhBmknznnairEhHpOIV7Su/ecPvtcM01sGgRHHoovPde1FWJiHSMwj2NWZh7ZtGiMJLm5JPVRSMi8aRwz+Doo8Nt+u6/Hy6/POpqRETaT+Gexdlnw4wZcNFF8Ic/RF2NiEj7KNyzMAvz0IwdG0bUvPJK1BWJiORO4d6Knj3hnnugoQG++EVNVSAi8aFwb0NVFdx2W7jQ6cwzdR9WEYkHhXsOvvCF0Pd+001hNkkRkWKncM/Rj34ExxwThkouWRJ1NSIirVO456i8PMxDM2hQ6H/fsCHqikREslO4t0PfvuEE69tvw4knhhOtIiLFSOHeTuPHw69+BX/6U5gyWESkGCncO+Dkk+GMM8JVrHfdFXU1IiItKdw76KqrYMIEOOUUWLEi6mpERHamcO+grl1hwYJwodO0abBxY9QViYg0UbjvgkGDYP58WL0avvpVXeAkIsVD4b6LjjgCrrgC7r0XfvrTqKsREQkU7nlwzjlhaOSFF8Ijj0RdjYiIwj0vGmeQHD06TBP86qtRVyQipU7hnie9eoULnLZtC1ewbt4cdUUiUsoU7nk0YgTceissWwazZ0ddjYiUMoV7nk2dCj/4AdxwA1x3XdTViEipyinczWySma0ys1ozO7+VdtPNzM2sOn8lxs+cOfC5z4Wj92efjboaESlFbYa7mZUDc4FjgTHATDMbk6Fdb+AsoOQnxG2cQXLgQJg+Herro65IREpNLkfuBwO17r7G3bcCdwBTM7T7b+AKQKcSgT33DCdYN2wII2g0g6SIdKZcwn0QsC5tuS617l/MbDwwxN0fyGNtsXfAAXDttfDYY2EMvIhIZ8kl3C3Dun9daG9mZcDPgXPb/CKzWWZWY2Y19SXSV3HKKfD1r4erWO++O+pqRKRU5BLudcCQtOXBwPq05d7AJ4A/mdkrwARgYaaTqu4+z92r3b26f//+Ha86Zq6+Gg45JMw/s3Jl1NWISCnIJdyXAlVmNszMugIzgIWNb7r7e+7ez90r3b0SWAxMcfeaglQcQ926hRkkd9sNjj8e3n8/6opEJOnaDHd3bwBmAw8DK4H57r7czOaY2ZRCF5gUgwfDnXfCSy+FrhrNICkihWQeUcpUV1d7TU3pHdxfeSV897thBsnvfS/qakQkbsxsmbu3eS2RrlDtZOeeCyecEO6/+uijUVcjIkmlcO9kZmFqgpEjw/j3116LuiIRSSKFewR69w4399iyJVzBqhkkRSTfFO4RGTkSbrkFli6Fs86KuhoRSRqFe4SmTYPzzw+zR95wQ9TViEiSKNwjdsklMHEinHkmlODgIREpEIV7xMrL4fbbYcCAcAent96KuiIRSQKFexHo1y/MO/PmmzBzJmzfHnVFIhJ3CvciUV0Nc+fCI4/Ad74TRtKIiHSUwr2InHoqnHEGXHMNDB8OV10FmzZFXZWIxJHCvcjMnQsPPxzC/ZxzoLISfvITePfdqCsTkThRuBcZs3D/1SeegKeegoMOgosugqFDw423N2yIukIRiQOFexH7zGfgoYfguedC4F9+eTiSP/tsWLeuzY+LSAlTuMfA+PFw112wYgV86Uuh62b4cDjtNKitjbo6ESlGCvcYGTUKbr45BPqsWXDbbWEag5kz4e9/j7o6ESkmCvcYqqyEX/4SXnkFzjsPHngA9t8fpkyBJUuirk5EioHCPcb22ivc9OPVV+HHP4ann4YJE+Doo+Gxx3S3J5FSpnBPgL594eKLQ8hfeWXomz/6aPj0p+F3v1PIi5QihXuC9OoV7vS0di1ce22YzmDKFBg3Du64Q9MaiJQShXsCde8O3/hGuBn3rbdCQ0M46TpqVJhaeOvWqCsUkUJTuCdYRQV85SvwwgthYrLddw/DJ4cPD1McfPhh1BWKSKEo3EtAWRkcf3yYL37RIhg2LFwIVVkJl10G770XdYUikm8K9xJiBsccA08+GR4HHhimNBg6NExxUF8fdYUiki8K9xL1b/8Gv/89LFsW7gR16aXhSH727DCnjU6+isSbwr3EHXAALFgAy5fD9Onhfq6HHw4DB4b++QcfhM2bo65SRNpL4S4AjB4Nt9wSbvN3551hnPz8+fD5z0P//nDiiWE45caNUVcqIrkwj+gKl+rqaq/RHaGL2pYt4UrXe++F++8P0w137RqCf9q0MIZ+wICoqxQpLWa2zN2r22yncJdcbN8OixeHoL/3XlizJpygPfTQEPTTpsG++0ZdpUjyKdylYNzDLJSNQf+3v4X1++/fFPT77x/CX0TyS+EunWbNGrjvvhD0Tz8dwn/YMDjuuBD0hx4K5eVRVymSDAp3icSGDbBwYQj6Rx4JUx18/OOhf37atNBf361b1FWKxJfCXSK3cWMYS3/vveF2ge+/D717w+TJ4ah+8uQwJYKI5E7hLkVlyxZ49NGmkTf19Rp5I9IRCncpWtu3wzPPNJ2QXbs2nHzdf/8wo2Wj9BOyu/I6l3YVFTBoEAwZEh777NP0unfv9m+jSKEo3CUW3OH550PIP/ss7NjRtD69TUdf59pu82Z4/XVYv77l5z/2sZ3Dvnn4Dx6s8wjSeXIN9y6dUYxINmbhZiLjxkVdSbBtWwj4devC47XXml6vWxfuUfv22y0/N2BA5uBvXN5rL40Yks6lcBdJU1ERZskcOjR7mw8/hLq6zOG/ahX88Y/wwQc7f6ZLF9h778zBX1kZpn+oqCjopkmJUbiLtFOPHjBiRHhk4h7myG/t6P/uu3e+I1b37mEK5gkT4JBDwmPIEF0IJh2nPneRCOzYEUYMrVsHtbXhfMOSJWEK5i1bQpuBA5uCfsIEqK4O98mV0pbXE6pmNgm4GigHrnf3y5u9/x3gNKABqAe+5u6vtvadCneRlrZuDSeYFy8OYb9kCbz8cnivrAzGjt356H70aPXll5q8hbuZlQMvAf8O1AFLgZnuviKtzWeBJe7+oZmdARzp7ie29r0Kd5HcvP1205H94sXh9TvvhPd694aDDmo6uj/kEF0vkHT5HC1zMFDr7mtSX3wHMBX4V7i7++Np7RcDX25fuSKSzZ57wrHHhgeEPv2XX9756P5nP4OGhvD+0KFNQT9hAowfv/P1A1Iacgn3QcC6tOU64JBW2p8K/H5XihKR7MyaTuiedFJY99FH8NxzTUf3zzwTbroCYRTOuHE7H93vt59O1iZdLuGe6Z9Axr4cM/syUA0ckeX9WcAsgH322SfHEkWkLbvtBocdFh6N3nij6ch+8WK4+WaYOze817dvCPkDDwwXaXXvHi7E6tat6XWuz1005q4o5bJb6oAhacuDgfXNG5nZROBC4Ah335Lpi9x9HjAPQp97u6sVkZwNHBgmaDvuuLC8fTusWLFzd86iRS2vyG2vsrLcfxE0f11RER5dujQ9OmO5oiJMWpfkv15yCfelQJWZDQNeB2YA/5newMzGA78GJrn7hrxXKSK7rLwcPvnJ8Dj99LBu27Yw9cKWLeHR+Lq9z2212bgx8/qGhp0fjdNPdIZu3cKFZXvvHeYVav7c+LpHj86rKZ/aDHd3bzCz2cDDhKGQN7r7cjObA9S4+0LgZ0Av4C4Lvwpfc/cpBaxbRPKg8Si2WCZH27Fj57Dftq3lL4Dm63Jp03zdli3w5ptN8wn99a/w4IOwaVPLmvr0yR78jc8DBhRf91RO5bj7Q8BDzdZdnPZ6Yp7rEpESVFYWpoLu2jWan79xY1Pgpz83vl65MpzL2L69Zd177ZX9r4DG5z59Oq8rqMh+14iIRGf33cNj9OjsbbZvD1cXZ/slsGYNPPUU/POfLT+7224h6C+5BGbMKNx2gMJdRKRdysvDUfpee4XRRtls3twy/Buf+/cvfJ0KdxGRAujeHfbdNzyiUBbNjxURkUJSuIuIJJDCXUQkgRTuIiIJpHAXEUkghbuISAIp3EVEEkjhLiKSQJHdINvM6oFW77MK9APe6oRyio22u7SU6nZD6W77rmz3UHdv8xrXyMI9F2ZWk8u9ApNG211aSnW7oXS3vTO2W90yIiIJpHAXEUmgYg/3eVEXEBFtd2kp1e2G0t32gm93Ufe5i4hIxxT7kbuIiHRA0Ya7mU0ys1VmVmtm50ddT6GY2RAze9zMVprZcjM7O7W+r5n90cxeTj3vEXWt+WZm5Wb2FzN7ILU8zMyWpLb5TjOL6GZrhWVmfcxsgZm9mNrvny6R/X1O6t/4C2Z2u5l1T+I+N7MbzWyDmb2Qti7j/rXgmlTOPW9mB+SrjqIMdzMrB+YCxwJjgJlmNibaqgqmATjX3UcDE4AzU9t6PvCou1cBj6aWk+ZsYGXa8k+Bn6e2+R3g1EiqKryrgUXuPgoYR/hvkOj9bWaDgLOAanf/BFAOzCCZ+/xmYFKzddn277FAVeoxC7g2X0UUZbgDBwO17r7G3bcCdwBTI66pINz9DXd/LvX6fcL/6IMI23tLqtktwHHRVFgYZjYY+A/g+tSyAUcBC1JNErfNAGa2O3A4cAOAu29193dJ+P5O6QLsZmZdgB7AGyRwn7v7k0DzO6hm279TgVs9WAz0MbOB+aijWMN9ELAubbkutS7RzKwSGA8sAQa4+xsQfgEAH4+usoK4CvgesCO1vCfwrrs3pJaTus/3BeqBm1JdUtebWU8Svr/d/XXgSuA1Qqi/ByyjNPY5ZN+/Bcu6Yg13y7Au0cN6zKwXcDfwbXffGHU9hWRmnwc2uPuy9NUZmiZxn3cBDgCudffxwCYS1gWTSaqPeSowDNgb6Enokmguifu8NQX7d1+s4V4HDElbHgysj6iWgjOzCkKw/5+735Na/Wbjn2ep5w1R1VcAhwFTzOwVQpfbUYQj+T6pP9khufu8Dqhz9yWp5QWEsE/y/gaYCKx193p33wbcAxxKaexzyL5/C5Z1xRruS4Gq1Jn0roQTLwsjrqkgUn3NNwAr3f1/0t5aCJycen0ycH9n11Yo7n6Buw9290rCvn3M3f8LeByYnmqWqG1u5O7/ANaZ2cjUqqOBFSR4f6e8Bkwwsx6pf/ON2534fZ6Sbf8uBE5KjZqZALzX2H2zy9y9KB/AZOAlYDVwYdT1FHA7P0P4M+x54K+px2RCH/SjwMup575R11qg7T8SeCD1el/gWaAWuAvoFnV9BdrmTwE1qX1+H7BHKexv4MfAi8ALwG+Abknc58DthPMK2whH5qdm27+Ebpm5qZz7O2E0UV7q0BWqIiIJVKzdMiIisgsU7iIiCaRwFxFJIIW7iEgCKdxFRBJI4S4ikkAKdxGRBFK4i4gk0P8DYR/Xx6AlgLgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7ff22b39bd68>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同PCA维数下模型的性能，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "小结：\n",
    "筛选 train 和test中出现的活动做聚类， 共有 13418条，\n",
    "如果采用 MiniBatchKMeans，样本数 太少，而聚类类别多的话，最后的结果CH_score 会 不稳定，每次运行的变化大\n",
    "如果采用KMeans， 样本数目还不算很大， 运行时间 比 MiniBatchKMeans多， 大牛市 CH_score 会稍稳定一点\n",
    "\n",
    "同时，最后得出的分类效果是 分 2类时， 聚类效果特别好，对比 题目中 要求 对 事件 是否 感兴趣 分类， \n",
    "如果最后结果 能和 用户的 感兴趣 程度 相吻合， 那么  这个聚类 可能就是 成功的，\n",
    "如果 最后聚类的结果 和 用户的感兴趣程度 不吻合， 那么 分 2类 可能就是 分得太粗糙了，\n",
    "不过 单纯通过CH_score 是无法 判断是不是 合适的分类\n",
    "\n",
    "从上面的图像，在分 2 类 到 10类，CH_score就急剧下降，只能说明 可能原本数据 就是 比较分2类的\n",
    "再通过分两类时 CH_score: 0.9939， 完全符合球型高斯？ 或者 就是 因为 维数太高，分类 不准确。。\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
